"""Workbook to analyse encode predictions.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches, duplicate-code
'Workbook to analyse encode predictions.\n'
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
from __future__ import annotations
import copy
import functools
import gc
from pathlib import Path
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from sklearn.metrics import confusion_matrix, f1_score
from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.metrics_per_assay import MetricsPerAssay
from epi_ml.utils.notebooks.paper.paper_utilities import (
ASSAY,
ASSAY_ORDER,
CELL_TYPE,
EPIATLAS_16_CT,
LIFE_STAGE,
SEX,
IHECColorMap,
MetadataHandler,
SplitResultsHandler,
set_file_id,
)
# Metadata category keys used throughout the notebook.
CANCER = "harmonized_sample_cancer_high"
BIOMAT = "harmonized_biomaterial_type"
# The seven EpiATLAS core assays (first 7 entries of the canonical assay order).
CORE_ASSAYS = ASSAY_ORDER[0:7]

# Paper directory layout; fail fast if the expected tree is missing.
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
table_dir = base_dir / "tables"
paper_dir = base_dir
if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

metadata_handler = MetadataHandler(paper_dir)
split_results_handler = SplitResultsHandler()
IHECColorMap = IHECColorMap(base_fig_dir)  # NOTE: instance shadows the imported class name
assay_colors = IHECColorMap.assay_color_map

# ENCODE inputs: metadata files and model prediction files.
encode_metadata_dir = base_data_dir / "metadata" / "encode"
encode_predictions_dir = base_data_dir / "training_results" / "predictions" / "encode"
for path in [encode_metadata_dir, encode_predictions_dir]:
    if not path.exists():
        raise FileNotFoundError(f"Directory {path} does not exist.")
Was created in encode_metadata_creation.ipynb using new web API downloads.
# Full ENCODE metadata table (created in encode_metadata_creation.ipynb
# from web API downloads; revoked files already removed).
full_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_full_metadata_2025-02_no_revoked.csv"
)
complete_metadata_df = pd.read_csv(full_metadata_path, low_memory=False)
5 categories value counts
# Sanity check: value counts for the main label categories.
# Some columns are absent from this metadata version, hence the KeyError guard.
for cat in [
    ASSAY,
    CELL_TYPE,
    "cell_type",
    "epiclass_sample_ontology",
    "donor_life_stage",
    "donor_sex",
    "cancer_status",
    BIOMAT,
]:
    try:
        print(complete_metadata_df[cat].value_counts(dropna=False), "\n")
    except KeyError:
        print(f"Column {cat} not found.")
assay_epiclass non-core 3593 input 1981 rna_seq 1188 h3k4me3 705 h3k27ac 623 mrna_seq 604 h3k27me3 585 h3k4me1 556 h3k36me3 554 h3k9me3 536 ctcf 468 wgbs 252 Name: count, dtype: int64 harmonized_sample_ontology_intermediate NaN 2559 epithelial cell derived cell line 1886 lymphoma or leukaemia cell line 1354 cancer cell line 797 mesoderm-derived structure 764 T cell 480 stem cell derived cell line 399 digestive system 339 B cell derived cell line 298 endoderm-derived structure 255 colon 174 brain 159 connective tissue cell 152 muscle organ 148 neural cell 138 pancreas 116 fibroblast derived cell line 114 mucosa 113 lymphocyte of B lineage 107 epithelium 98 liver 85 keratinocyte 83 monocyte 81 embryonic cell (metazoa) 73 meso-epithelial cell 67 muscle cell 62 placenta 58 nervous system 55 hematopoietic cell 49 lung 48 natural killer cell 46 ovary 44 mammary gland epithelial cell 43 suprapubic skin 42 muscle precursor cell 39 peripheral blood mononuclear cell 34 uterus 32 extraembryonic cell 31 kidney 30 testis 29 neural progenitor cell 28 melanocyte 24 hepatocyte 22 trophoblast 20 KMS-11 16 neutrophil 13 secretory cell 10 stem cell 10 kidney epithelial cell 7 endo-epithelial cell 7 erythroid lineage cell 5 mole 2 Name: count, dtype: int64 Column cell_type not found. 
epiclass_sample_ontology NaN 2559 epithelial cell derived cell line 1886 lymphoma or leukaemia cell line 1354 cancer cell line 797 mesoderm-derived structure 764 T cell 480 stem cell derived cell line 399 digestive system 339 B cell derived cell line 298 endoderm-derived structure 255 colon 174 brain 159 connective tissue cell 152 muscle organ 148 neural cell 138 pancreas 116 fibroblast derived cell line 114 mucosa 113 lymphocyte of B lineage 107 epithelium 98 liver 85 keratinocyte 83 monocyte 81 embryonic cell (metazoa) 73 meso-epithelial cell 67 muscle cell 62 placenta 58 nervous system 55 hematopoietic cell 49 lung 48 natural killer cell 46 ovary 44 mammary gland epithelial cell 43 suprapubic skin 42 muscle precursor cell 39 peripheral blood mononuclear cell 34 uterus 32 extraembryonic cell 31 kidney 30 testis 29 neural progenitor cell 28 melanocyte 24 hepatocyte 22 trophoblast 20 KMS-11 16 neutrophil 13 secretory cell 10 stem cell 10 kidney epithelial cell 7 endo-epithelial cell 7 erythroid lineage cell 5 mole 2 Name: count, dtype: int64 Column donor_life_stage not found. Column donor_sex not found. Column cancer_status not found. harmonized_biomaterial_type cell line 5537 primary tissue 3611 primary cell 1856 other 600 NaN 41 Name: count, dtype: int64
# Backfill `cancer_status` from the older (2023) ChIP metadata when the new
# metadata table lacks it, joining file accession against the old md5sum column.
if "cancer_status" not in complete_metadata_df.columns:
    chip_path = (
        encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
    )
    chip_metadata_df = pd.read_csv(chip_path, low_memory=False)
    cancer_status = chip_metadata_df[["md5sum", "cancer_status"]]
    complete_metadata_df = complete_metadata_df.merge(
        cancer_status, how="left", left_on="FILE_accession", right_on="md5sum"
    )
    # Drop the join key duplicated from the old table.
    complete_metadata_df.drop(columns="md5sum", inplace=True)
    del chip_metadata_df
    gc.collect()
# Load per-category prediction files for ChIP-seq ENCODE files,
# one folder per classification task ([category]_1l_3000n).
chip_pred_dfs = {}
nb_chip_files = 9619  # expected number of ChIP files in every prediction file
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # [category]_1l_3000n
    pred_file = list(folder.rglob("complete_no_valid_oversample*chip.csv"))[0]
    encode_df = pd.read_csv(pred_file)
    encode_df = set_file_id(encode_df)
    chip_pred_dfs[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_chip_files
# Expect exactly 6 classification tasks.
if len(chip_pred_dfs) != 6:
    raise ValueError(f"{len(chip_pred_dfs)} != 6")
harmonized_biomaterial_type (9619, 7) harmonized_donor_life_stage (9619, 8) harmonized_sample_ontology_intermediate (9619, 19) assay_epiclass (9619, 14) harmonized_donor_sex (9619, 6) harmonized_sample_cancer_high (9619, 5)
# Same loading pattern as above, for RNA-seq ENCODE files.
pred_dfs_rna = {}
nb_rna_files = 1790  # expected number of RNA files in every prediction file
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # [category]_1l_3000n
    pred_file = list(folder.rglob("complete_no_valid_oversample*rna.csv"))[0]
    encode_df = pd.read_csv(pred_file)
    encode_df = set_file_id(encode_df)
    pred_dfs_rna[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_rna_files
if len(pred_dfs_rna) != 6:
    raise ValueError(f"{len(pred_dfs_rna)} != 6")
harmonized_biomaterial_type (1790, 7) harmonized_donor_life_stage (1790, 8) harmonized_sample_ontology_intermediate (1790, 19) assay_epiclass (1790, 14) harmonized_donor_sex (1790, 6) harmonized_sample_cancer_high (1790, 5)
# Same loading pattern as above, for WGBS ENCODE files.
pred_dfs_wgb = {}
nb_wgb_files = 252  # expected number of WGBS files in every prediction file
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # [category]_1l_3000n
    pred_file = list(folder.rglob("complete_no_valid_oversample*wgbs.csv"))[0]
    encode_df = pd.read_csv(pred_file)
    encode_df = set_file_id(encode_df)
    pred_dfs_wgb[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_wgb_files
if len(pred_dfs_wgb) != 6:
    raise ValueError(f"{len(pred_dfs_wgb)} != 6")
harmonized_biomaterial_type (252, 7) harmonized_donor_life_stage (252, 8) harmonized_sample_ontology_intermediate (252, 19) assay_epiclass (252, 14) harmonized_donor_sex (252, 6) harmonized_sample_cancer_high (252, 5)
# Rename the generic assay task key to "<ASSAY>_11c" so it can coexist with
# the 7-class assay predictions added later.
# (Fixed idiom: assign-then-`del` replaced with a single dict.pop.)
for pred_dfs in [chip_pred_dfs, pred_dfs_rna, pred_dfs_wgb]:
    pred_dfs[f"{ASSAY}_11c"] = pred_dfs.pop(ASSAY)

# Concatenate ChIP + RNA + WGBS predictions per classification task.
concat_pred_dfs = {}
for category in [f"{ASSAY}_11c", CELL_TYPE, SEX, LIFE_STAGE, CANCER, BIOMAT]:
    concat_pred_dfs[category] = pd.concat(
        [
            chip_pred_dfs[category],
            pred_dfs_rna[category],
            pred_dfs_wgb[category],
        ]
    )
# Add the 7-class assay model predictions (a single file covering all targets).
assay_7c_path = list(
    (encode_predictions_dir / "assay_epiclass_1l_3000n" / "7c").glob(
        "**/complete_no_valid_oversample*.csv"
    )
)
assert len(assay_7c_path) == 1
assay_7c_df = pd.read_csv(assay_7c_path[0])
assay_7c_df = set_file_id(assay_7c_df)
concat_pred_dfs[f"{ASSAY}_7c"] = assay_7c_df
# Number of shared leading columns after md5sum — presumably
# "True class"/"Predicted class"; TODO confirm against the prediction CSVs.
same_col_names = 2
# Make all different columns have unique relevant names except for the pred vector
for cat, df in concat_pred_dfs.items():
    try:
        df.drop(columns=["Same?"], inplace=True)
    except KeyError:
        pass
    # Max probability over the per-class prediction columns.
    df.insert(3, "Max pred", value=df.iloc[:, 1 + same_col_names :].max(axis=1))
    if cat in [f"{ASSAY}_11c", f"{ASSAY}_7c"]:
        # Assay tasks: suffix every column (class columns overlap between 7c/11c).
        old_names = df.columns[1:]
    else:
        old_names = df.columns[1 : 2 + same_col_names]
    new_names = [f"{old_name} ({cat})" for old_name in old_names]
    df.rename(columns=dict(zip(old_names, new_names)), inplace=True)
    df.set_index("md5sum", inplace=True)
    concat_pred_dfs[cat] = df

# Merge all task prediction frames on the md5sum index.
df_order = [f"{ASSAY}_7c", f"{ASSAY}_11c", CELL_TYPE, SEX, LIFE_STAGE, CANCER, BIOMAT]
df_list = [concat_pred_dfs[cat] for cat in df_order]
full_merged_df = functools.reduce(merge_dataframes, df_list)
# Attach full metadata to the merged predictions; duplicated metadata columns
# get a "_delete" suffix and are dropped just below.
preds_plus_metadata_df: pd.DataFrame = full_merged_df.merge(
    complete_metadata_df,
    left_index=True,
    right_on="FILE_accession",
    how="left",
    suffixes=("", "_delete"),
)
for col in preds_plus_metadata_df.columns:
    if col.endswith("_delete"):
        print(col)
        preds_plus_metadata_df.drop(columns=col, inplace=True)
assert isinstance(preds_plus_metadata_df, pd.DataFrame)  # pylance being weird

# Reorder columns: prediction columns first, then metadata columns.
meta_col_order = [
    col for col in complete_metadata_df.columns if col in preds_plus_metadata_df.columns
]
results_col_order = [
    col for col in full_merged_df.columns if col in preds_plus_metadata_df.columns
]
new_order = results_col_order + meta_col_order
preds_plus_metadata_df = preds_plus_metadata_df.loc[:, new_order]
# Copy metadata labels into the "True class (<task>)" columns, one
# (task, metadata column) pair at a time; also overwrite the task column itself.
for pairs in [
    (SEX, SEX),
    (LIFE_STAGE, LIFE_STAGE),
    (CANCER, "cancer_status"),
    (CELL_TYPE, "epiclass_sample_ontology"),
    (BIOMAT, BIOMAT),
]:
    name1 = f"True class ({pairs[0]})"
    name2 = pairs[1]
    print(name1, name2)
    preds_plus_metadata_df[name1] = preds_plus_metadata_df[name2]
    preds_plus_metadata_df[pairs[0]] = preds_plus_metadata_df[pairs[1]]
True class (harmonized_donor_sex) harmonized_donor_sex True class (harmonized_donor_life_stage) harmonized_donor_life_stage True class (harmonized_sample_cancer_high) cancer_status True class (harmonized_sample_ontology_intermediate) epiclass_sample_ontology True class (harmonized_biomaterial_type) harmonized_biomaterial_type
# Both assay tasks (7c and 11c) share the same ground-truth assay label.
preds_plus_metadata_df[f"True class ({ASSAY}_7c)"] = preds_plus_metadata_df[ASSAY]
preds_plus_metadata_df[f"True class ({ASSAY}_11c)"] = preds_plus_metadata_df[ASSAY]
Removing revoked files (no available metadata)
# Remove revoked files: rows that matched no metadata entry have NaN
# in "in_epiatlas".
preds_plus_metadata_df = preds_plus_metadata_df[
    ~preds_plus_metadata_df["in_epiatlas"].isna()
]
print(preds_plus_metadata_df.shape)
(11643, 276)
# Save the combined predictions + metadata table (gzip-compressed CSV).
logdir = base_data_dir / "training_results" / "predictions" / "encode"
filename = logdir / "complete_encode_predictions_augmented_2025-02_metadata.csv.gz"
preds_plus_metadata_df.to_csv(filename, index=False, compression="gzip")
Remove datasets overlapping with EpiATLAS
# Keep only files NOT overlapping with EpiATLAS (in_epiatlas is falsy).
all_preds_no_epiatlas = preds_plus_metadata_df[
    ~preds_plus_metadata_df["in_epiatlas"].astype(bool)
]
print(all_preds_no_epiatlas.shape)
(8777, 276)
# Assay distribution among non-EpiATLAS files.
all_preds_no_epiatlas[ASSAY].value_counts(dropna=False)
assay_epiclass non-core 3593 input 1284 rna_seq 948 mrna_seq 604 ctcf 468 h3k4me3 379 h3k27ac 366 h3k27me3 294 h3k4me1 257 h3k36me3 245 h3k9me3 241 wgbs 98 Name: count, dtype: int64
# Restrict to files whose cell type is one of the 16 EpiATLAS cell types.
cell_type_df = preds_plus_metadata_df.copy(deep=True)
cell_type_df = cell_type_df[cell_type_df[CELL_TYPE].isin(EPIATLAS_16_CT)]
print(cell_type_df.shape)
display(cell_type_df[CELL_TYPE].value_counts(dropna=False))
(1842, 276)
harmonized_sample_ontology_intermediate mesoderm-derived structure 764 endoderm-derived structure 255 colon 174 brain 159 connective tissue cell 152 muscle organ 148 monocyte 81 mammary gland epithelial cell 43 extraembryonic cell 31 hepatocyte 22 neutrophil 13 Name: count, dtype: int64
# Split the 16-cell-type files by assay: EpiATLAS core assays plus mRNA-seq
# are treated as "core" for this comparison; everything else is "non-core".
core_assays = ASSAY_ORDER + ["mrna_seq"]
cell_type_core_df = cell_type_df[cell_type_df[ASSAY].isin(core_assays)]
cell_type_noncore_df = cell_type_df[~cell_type_df[ASSAY].isin(core_assays)]
# Show cell-type and assay distributions for each subset.
# (Fixed: removed a dead `N = df.shape[0]` assignment that was never read.)
for df in [cell_type_core_df, cell_type_noncore_df]:
    print(df.shape)
    display(df[CELL_TYPE].value_counts(dropna=False))
    display(df["assay"].value_counts(dropna=False))
(1580, 276)
harmonized_sample_ontology_intermediate mesoderm-derived structure 685 endoderm-derived structure 215 brain 149 colon 141 muscle organ 137 connective tissue cell 105 monocyte 73 mammary gland epithelial cell 35 extraembryonic cell 15 hepatocyte 13 neutrophil 12 Name: count, dtype: int64
assay input 281 rna_seq 192 h3k4me3 164 h3k4me1 153 h3k36me3 152 h3k9me3 146 h3k27ac 142 h3k27me3 138 mrna_seq 128 wgbs 84 Name: count, dtype: int64
(262, 276)
harmonized_sample_ontology_intermediate mesoderm-derived structure 79 connective tissue cell 47 endoderm-derived structure 40 colon 33 extraembryonic cell 16 muscle organ 11 brain 10 hepatocyte 9 monocyte 8 mammary gland epithelial cell 8 neutrophil 1 Name: count, dtype: int64
assay ctcf 116 polr2a 29 h3k9ac 26 polr2aphosphos5 16 h3k4me2 9 ep300 9 h2afz 8 h3k79me2 7 h4k20me1 6 ezh2 4 h2bk12ac 3 h3k14ac 3 h3k18ac 3 h3k4ac 3 h3k23ac 3 h4k8ac 3 h3k79me1 2 h2bk120ac 2 h4k91ac 2 h2bk5ac 2 h2ak5ac 2 ezh2phosphot487 1 h2bk15ac 1 h3k9me2 1 h4k12ac 1 Name: count, dtype: int64
# Cell-type accuracy/F1 at increasing prediction-score thresholds,
# computed separately for core and non-core assay files.
for df, name in zip([cell_type_core_df, cell_type_noncore_df], ["core", "noncore"]):
    print(name)
    full_N = df.shape[0]
    for min_pred in [0, 0.6, 0.8, 0.9]:
        acc, f1, N = MetricsPerAssay.compute_metrics(
            df=df, min_pred=min_pred, cat_label=CELL_TYPE
        )
        # N is the number of files retained at this threshold.
        print(
            f"Min pred: {min_pred}, N: {N} ({N/full_N:.2%}), Acc: {acc:.3f}, F1: {f1:.3f}"
        )
    print()
core Min pred: 0, N: 1580 (100.00%), Acc: 0.873, F1: 0.564 Min pred: 0.6, N: 1292 (81.77%), Acc: 0.966, F1: 0.836 Min pred: 0.8, N: 1155 (73.10%), Acc: 0.973, F1: 0.921 Min pred: 0.9, N: 998 (63.16%), Acc: 0.979, F1: 0.921 noncore Min pred: 0, N: 262 (100.00%), Acc: 0.851, F1: 0.657 Min pred: 0.6, N: 173 (66.03%), Acc: 0.971, F1: 0.877 Min pred: 0.8, N: 130 (49.62%), Acc: 0.992, F1: 0.967 Min pred: 0.9, N: 107 (40.84%), Acc: 1.000, F1: 1.000
Includes cell type classifiers trained with single assays. (e.g. only h3k4me1 files)
# Load cell-type classifiers trained on restricted inputs
# (single-assay models, e.g. h3k4me1 only, and assay7/assay11 models).
pred_folder = (
    base_data_dir
    / f"training_results/dfreeze_v2/hg38_100kb_all_none/{CELL_TYPE}_1l_3000n/complete-no_valid-oversampling"
)
if not pred_folder.exists():
    raise FileNotFoundError(f"Directory {pred_folder} does not exist.")

other_ct_dfs = {}
for folder in pred_folder.glob("*"):
    if not folder.is_dir():
        print(f"Skipping {folder}")
        continue
    # Expect exactly one prediction CSV per model folder.
    pred_file = list(folder.glob("predictions/*.csv"))
    if len(pred_file) > 1:
        print(f"More than one prediction file found in {folder}")
        continue
    if len(pred_file) == 0:
        print(f"No prediction file found in {folder}")
        continue
    pred_file = pred_file[0]
    pred_df = pd.read_csv(pred_file)
    name = folder.name.replace("complete_no_valid_oversample_", "")
    # Normalize label case for comparison with harmonized metadata labels.
    for col in ["True class", "Predicted class"]:
        pred_df[col] = pred_df[col].str.lower()
    other_ct_dfs[name] = pred_df
other_ct_dfs.keys()
dict_keys(['h3k4me3_only', 'assay11-ct16', 'h3k9me3_only', 'h3k36me3_only', 'assay7-ct16', 'h3k27ac_only', 'input_only', 'h3k27me3_only', 'h3k4me1_only', 'assay7-ct19'])
def compute_cell_type_acc(
    metadata_df: pd.DataFrame,
    pred_dfs_dict: Dict[str, pd.DataFrame],
    min_pred: float = 0.6,
) -> None:
    """Print accuracy/F1 of cell-type predictions over the 16 EpiATLAS cell types.

    An inner merge of the metadata and each prediction frame is performed
    (prediction md5sum vs metadata FILE_accession), so only files present in
    both are scored. Metrics are reported twice: over all predictions, and
    restricted to predictions with "Max pred" strictly above `min_pred`.
    """
    meta_df = metadata_df[metadata_df[CELL_TYPE].isin(EPIATLAS_16_CT)].copy()
    for name, pred_df in sorted(pred_dfs_dict.items()):
        print(name)
        pred_w_ct = pred_df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )
        N = pred_w_ct.shape[0]
        # Calculate results for all predictions
        true, pred = pred_w_ct[CELL_TYPE], pred_w_ct["Predicted class"]
        total_correct = (true == pred).sum()
        acc = total_correct / N
        # Macro F1 restricted to the labels actually predicted.
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")
        print(f"Acc (pred>0.0): {total_correct}/{N} ({acc:.2%})")
        print(f"F1 (pred>0.0): {f1:.2f}")
        # Calculate results for predictions with max_pred
        pred_w_ct_filtered = pred_w_ct[pred_w_ct["Max pred"] > min_pred]
        true, pred = pred_w_ct_filtered[CELL_TYPE], pred_w_ct_filtered["Predicted class"]
        total_correct_filtered = (true == pred).sum()
        perc_filtered = total_correct_filtered / pred_w_ct_filtered.shape[0]
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")
        print(
            f"Acc (pred>{min_pred:.1f}): {total_correct_filtered}/{pred_w_ct_filtered.shape[0]} ({perc_filtered:.2%})"
        )
        # Number of samples discarded by the confidence filter.
        diff = N - pred_w_ct_filtered.shape[0]
        print(f"F1 (pred>{min_pred}): {f1:.2f}")
        print(f"Samples ignored at {min_pred:.1f}: {diff} ({diff/N:.2%})\n")
# Score the restricted classifiers separately on non-core and core assay files.
mask_core_assays = complete_metadata_df[ASSAY].isin(core_assays)
non_core_metadata_df = complete_metadata_df[~mask_core_assays]
core_metadata_df = complete_metadata_df[mask_core_assays]
compute_cell_type_acc(non_core_metadata_df, other_ct_dfs)
print("\n")
compute_cell_type_acc(core_metadata_df, other_ct_dfs)
assay11-ct16 Acc (pred>0.0): 223/262 (85.11%) F1 (pred>0.0): 0.66 Acc (pred>0.6): 168/173 (97.11%) F1 (pred>0.6): 0.88 Samples ignored at 0.6: 89 (33.97%) assay7-ct16 Acc (pred>0.0): 199/262 (75.95%) F1 (pred>0.0): 0.56 Acc (pred>0.6): 173/185 (93.51%) F1 (pred>0.6): 0.93 Samples ignored at 0.6: 77 (29.39%) assay7-ct19 Acc (pred>0.0): 134/262 (51.15%) F1 (pred>0.0): 0.49 Acc (pred>0.6): 105/176 (59.66%) F1 (pred>0.6): 0.66 Samples ignored at 0.6: 86 (32.82%) h3k27ac_only Acc (pred>0.0): 169/262 (64.50%) F1 (pred>0.0): 0.48 Acc (pred>0.6): 85/90 (94.44%) F1 (pred>0.6): 0.93 Samples ignored at 0.6: 172 (65.65%) h3k27me3_only Acc (pred>0.0): 34/262 (12.98%) F1 (pred>0.0): 0.04 Acc (pred>0.6): 17/143 (11.89%) F1 (pred>0.6): 0.10 Samples ignored at 0.6: 119 (45.42%) h3k36me3_only Acc (pred>0.0): 170/262 (64.89%) F1 (pred>0.0): 0.49 Acc (pred>0.6): 66/75 (88.00%) F1 (pred>0.6): 0.87 Samples ignored at 0.6: 187 (71.37%) h3k4me1_only Acc (pred>0.0): 164/262 (62.60%) F1 (pred>0.0): 0.47 Acc (pred>0.6): 80/87 (91.95%) F1 (pred>0.6): 0.97 Samples ignored at 0.6: 175 (66.79%) h3k4me3_only Acc (pred>0.0): 165/262 (62.98%) F1 (pred>0.0): 0.50 Acc (pred>0.6): 144/173 (83.24%) F1 (pred>0.6): 0.68 Samples ignored at 0.6: 89 (33.97%) h3k9me3_only Acc (pred>0.0): 39/262 (14.89%) F1 (pred>0.0): 0.05 Acc (pred>0.6): 16/142 (11.27%) F1 (pred>0.6): 0.20 Samples ignored at 0.6: 120 (45.80%) input_only Acc (pred>0.0): 49/262 (18.70%) F1 (pred>0.0): 0.11 Acc (pred>0.6): 42/205 (20.49%) F1 (pred>0.6): 0.19 Samples ignored at 0.6: 57 (21.76%) assay11-ct16 Acc (pred>0.0): 1078/1176 (91.67%) F1 (pred>0.0): 0.74 Acc (pred>0.6): 993/1022 (97.16%) F1 (pred>0.6): 0.92 Samples ignored at 0.6: 154 (13.10%) assay7-ct16 Acc (pred>0.0): 1070/1176 (90.99%) F1 (pred>0.0): 0.73 Acc (pred>0.6): 962/981 (98.06%) F1 (pred>0.6): 0.93 Samples ignored at 0.6: 195 (16.58%) assay7-ct19 Acc (pred>0.0): 1080/1176 (91.84%) F1 (pred>0.0): 0.65 Acc (pred>0.6): 941/961 (97.92%) F1 (pred>0.6): 0.85 Samples ignored at 
0.6: 215 (18.28%) h3k27ac_only Acc (pred>0.0): 564/1176 (47.96%) F1 (pred>0.0): 0.34 Acc (pred>0.6): 372/410 (90.73%) F1 (pred>0.6): 0.73 Samples ignored at 0.6: 766 (65.14%) h3k27me3_only Acc (pred>0.0): 257/1176 (21.85%) F1 (pred>0.0): 0.16 Acc (pred>0.6): 182/416 (43.75%) F1 (pred>0.6): 0.51 Samples ignored at 0.6: 760 (64.63%) h3k36me3_only Acc (pred>0.0): 662/1176 (56.29%) F1 (pred>0.0): 0.38 Acc (pred>0.6): 482/612 (78.76%) F1 (pred>0.6): 0.70 Samples ignored at 0.6: 564 (47.96%) h3k4me1_only Acc (pred>0.0): 597/1176 (50.77%) F1 (pred>0.0): 0.32 Acc (pred>0.6): 518/670 (77.31%) F1 (pred>0.6): 0.65 Samples ignored at 0.6: 506 (43.03%) h3k4me3_only Acc (pred>0.0): 578/1176 (49.15%) F1 (pred>0.0): 0.35 Acc (pred>0.6): 449/576 (77.95%) F1 (pred>0.6): 0.59 Samples ignored at 0.6: 600 (51.02%) h3k9me3_only Acc (pred>0.0): 279/1176 (23.72%) F1 (pred>0.0): 0.15 Acc (pred>0.6): 202/504 (40.08%) F1 (pred>0.6): 0.44 Samples ignored at 0.6: 672 (57.14%) input_only Acc (pred>0.0): 349/1176 (29.68%) F1 (pred>0.0): 0.20 Acc (pred>0.6): 274/721 (38.00%) F1 (pred>0.6): 0.26 Samples ignored at 0.6: 455 (38.69%)
# Export confusion matrices for the -ct16 classifiers on core-assay files,
# with and without RNA-seq files, at several confidence thresholds.
conf_matrix_logdir = (
    base_fig_dir / "encode_predictions" / "confusion_matrices" / CELL_TYPE / "core"
)
if not conf_matrix_logdir.exists():
    conf_matrix_logdir.mkdir(parents=True)

# Core-assay files with one of the 16 EpiATLAS cell types.
meta_df = core_metadata_df[core_metadata_df[CELL_TYPE].isin(EPIATLAS_16_CT)].copy()
# Only the classifiers trained on the 16 cell types.
limited_pred_dfs_dict = {k: v for k, v in other_ct_dfs.items() if "-ct16" in k}
for no_rna in [True, False]:
    for task_name, df in limited_pred_dfs_dict.items():
        pred_w_ct = df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )
        if no_rna:
            pred_w_ct = pred_w_ct[~pred_w_ct[ASSAY].str.contains("rna")]
        for threshold in [0, 0.6, 0.8]:
            sub_df = pred_w_ct[pred_w_ct["Max pred"] >= threshold]
            true, pred = sub_df[CELL_TYPE], sub_df["Predicted class"]
            cm = confusion_matrix(true, pred, labels=EPIATLAS_16_CT)
            filename = f"{task_name}-core-confusion_matrix-{threshold*100}"
            if no_rna:
                # BUG FIX: the base filename was dropped here, so every no-RNA
                # matrix was written under the same name, overwriting the rest.
                final_filename = f"{filename}-no_rna"
                this_logdir = conf_matrix_logdir / "no_rna"
            else:
                final_filename = filename
                this_logdir = conf_matrix_logdir / "with_rna"
            this_logdir.mkdir(parents=True, exist_ok=True)
            writer = ConfusionMatrixWriter(labels=EPIATLAS_16_CT, confusion_matrix=cm)
            writer.to_all_formats(
                logdir=this_logdir,
                name=final_filename,
            )
plt.close("all")
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
No RNA-seq here
Download note
paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2/hg38_100kb_all_none/assay_epiclass_1l_3000n"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./*c/complete_no_valid_oversample .
paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./hg38_100kb_all_none_w_encode_noncore/assay_epiclass_1l_3000n/complete_no_valid_oversample-0 .
find -type f -name "*.list*.csv" -print0 | xargs -0 rename 's/\.list//g'
data_dir = base_data_dir / "training_results" / "dfreeze_v2"
# Prediction folders for the 7-, 11- and 13-class assay models.
assay7_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/7c/complete_no_valid_oversample"
)
assay11_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/11c/complete_no_valid_oversample"
)
assay13_folder = (
    data_dir
    / f"hg38_100kb_all_none_w_encode_noncore/{ASSAY}_1l_3000n/13c/complete_no_valid_oversample"
)
pred_dfs_dict = {}
for name, folder in zip(
    ["7c", "11c", "13c"], [assay7_folder, assay11_folder, assay13_folder]
):
    if not folder.exists():
        print(f"Folder {folder} does not exist.")
        continue
    pred_folder = folder / "predictions" / "encode"
    if not pred_folder.exists():
        print(f"Folder {pred_folder} does not exist.")
        continue
    # Expect exactly one encode prediction CSV per model.
    pred_file = list(pred_folder.glob("*.csv"))
    if len(pred_file) != 1:
        print(f"Found {len(pred_file)} files in {pred_folder}.")
        continue
    pred_file = pred_file[0]
    pred_df = pd.read_csv(pred_file, sep=",")
    try:
        pred_df.drop(columns=["Same?"], inplace=True)
    except KeyError:
        pass
    # Add assay metadata
    pred_df = pred_df.merge(
        complete_metadata_df, left_on="md5sum", right_on="FILE_accession", how="left"
    )
    # Ground truth comes from the harmonized assay label in the metadata.
    pred_df["True class"] = pred_df["assay_epiclass"]
    pred_dfs_dict[name] = pred_df
# Output directory for the mislabel reports.
# BUG FIX: the original used a chained assignment (`output_dir = data_dir = ...`)
# that silently clobbered `data_dir` (the dfreeze_v2 training-results dir).
output_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
# Per-model accuracy on core-assay, non-EpiATLAS ENCODE files;
# confident mislabels are dumped to CSV for manual inspection.
for name, df in pred_dfs_dict.items():
    print(name)
    print(df.shape)
    # Only consider files already labeled with core7 assays
    df = df[df[ASSAY].isin(CORE_ASSAYS)]
    # Only consider non-EpiAtlas samples
    df = df[df["in_epiatlas"].astype(str) == "False"]
    # Calculate results for all predictions
    correct_pred = df["Predicted class"] == df["True class"]
    total_correct = correct_pred.sum()
    total = df.shape[0]
    perc = total_correct / total
    print(f"Acc (pred>=0.0) {total_correct}/{total} ({perc:.2%})")
    # Per-assay accuracy among confident (>= 0.6) predictions.
    for assay in CORE_ASSAYS:
        min_pred = 0.6
        df_assay = df[df[ASSAY] == assay]
        df_assay = df_assay[df_assay["Max pred"] >= min_pred]
        correct_pred = df_assay["Predicted class"] == df_assay["True class"]
        total_correct = correct_pred.sum()
        total = df_assay.shape[0]
        perc = total_correct / total
        print(
            f"Acc (pred>={min_pred:.1f}) {assay} = {total_correct}/{total} ({perc:.2%})"
        )
    # Calculate results for predictions with max_pred > 0.6
    df_filtered = df[df["Max pred"] >= 0.6]
    correct_pred_filtered = df_filtered["Predicted class"] == df_filtered["True class"]
    total_correct_filtered = correct_pred_filtered.sum()
    total_filtered = df_filtered.shape[0]
    perc_filtered = total_correct_filtered / total_filtered
    print(
        f"Acc (pred>=0.6): {total_correct_filtered}/{total_filtered} ({perc_filtered:.2%})"
    )
    # Save the confident-but-wrong predictions for this model.
    df_filtered_wrong = df_filtered[~correct_pred_filtered]
    df_filtered_wrong.to_csv(
        output_dir / f"encode_only_mislabels_minPred0.6_{name}.csv", index=False
    )
7c (9619, 220) Acc (pred>=0.0) 3013/3066 (98.27%) Acc (pred>=0.6) h3k4me3 = 378/379 (99.74%) Acc (pred>=0.6) h3k27ac = 366/366 (100.00%) Acc (pred>=0.6) h3k4me1 = 254/256 (99.22%) Acc (pred>=0.6) h3k36me3 = 232/242 (95.87%) Acc (pred>=0.6) h3k27me3 = 285/291 (97.94%) Acc (pred>=0.6) h3k9me3 = 241/241 (100.00%) Acc (pred>=0.6) input = 1224/1236 (99.03%) Acc (pred>=0.6): 2980/3011 (98.97%) 11c (9619, 224) Acc (pred>=0.0) 2961/3066 (96.58%) Acc (pred>=0.6) h3k4me3 = 378/378 (100.00%) Acc (pred>=0.6) h3k27ac = 365/365 (100.00%) Acc (pred>=0.6) h3k4me1 = 254/255 (99.61%) Acc (pred>=0.6) h3k36me3 = 230/242 (95.04%) Acc (pred>=0.6) h3k27me3 = 285/291 (97.94%) Acc (pred>=0.6) h3k9me3 = 235/239 (98.33%) Acc (pred>=0.6) input = 1146/1179 (97.20%) Acc (pred>=0.6): 2893/2949 (98.10%) 13c (9619, 226) Acc (pred>=0.0) 2532/3066 (82.58%) Acc (pred>=0.6) h3k4me3 = 377/378 (99.74%) Acc (pred>=0.6) h3k27ac = 345/357 (96.64%) Acc (pred>=0.6) h3k4me1 = 253/255 (99.22%) Acc (pred>=0.6) h3k36me3 = 229/239 (95.82%) Acc (pred>=0.6) h3k27me3 = 281/290 (96.90%) Acc (pred>=0.6) h3k9me3 = 238/240 (99.17%) Acc (pred>=0.6) input = 667/1037 (64.32%) Acc (pred>=0.6): 2390/2796 (85.48%)
We're using manually labeled assay categories for non-core assays.
We labeled each assay having at least 3 files, and their related family members.
We're not including CTCF in this analysis.
# Non-core (and CTCF) files, as classified by the 7-class assay model.
non_core_preds = pred_dfs_dict["7c"].copy(deep=True)
non_core_preds = non_core_preds[non_core_preds[ASSAY].isin(["non-core", "ctcf"])]
print(f"Non-core datasets: {non_core_preds.shape[0]}")
# Fraction of non-core files with a confident (>= 0.6) prediction.
N_high_conf = (non_core_preds["Max pred"] >= 0.6).sum()
N_total = non_core_preds.shape[0]
print(
    f"High confidence non-core predictions: {N_high_conf / N_total:.2%} ({N_high_conf}/{N_total})"
)
Non-core datasets: 4061 High confidence non-core predictions: 72.59% (2948/4061)
# Output directory for non-core prediction summaries.
# BUG FIX: the original used a chained assignment (`output_dir = data_dir = ...`)
# that silently clobbered `data_dir`.
output_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
if not output_dir.exists():
    output_dir.mkdir(parents=True)
# Count predicted classes per (Predicted class, assay) pair at several
# confidence thresholds and save each table to CSV.
df = non_core_preds
for min_pred in [0, 0.6, 0.8]:
    df_filtered = df[df["Max pred"] >= min_pred]
    groupby = (
        df_filtered.groupby(["Predicted class", "assay"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["Predicted class", "Count"], ascending=[True, False])
        .set_index(["Predicted class", "assay"])["Count"]
    )
    # BUG FIX: the filename interpolated `{name}`, a stale variable leaked from
    # an earlier loop (last value "13c"); these predictions come from the 7c
    # model (see `pred_dfs_dict["7c"]` above), so the tag is now explicit.
    groupby.to_csv(
        output_dir / f"encode_non-core_7c_predictions_minPred{min_pred}.csv"
    )
# Manually curated assay-category labels for non-core ENCODE assays
# (each assay with >= 3 files, plus related family members).
encode_metadata_dir = base_data_dir / "metadata/encode"
non_core_categories_path = (
    encode_metadata_dir / "non-core_encode_assay_category_2024-08-29.csv"
)
if not non_core_categories_path.exists():
    raise FileNotFoundError(f"File {non_core_categories_path} does not exist.")
non_core_categories_df = pd.read_csv(
    non_core_categories_path, sep=",", names=["assay", "assay_category", "note"]
)
print(non_core_categories_df.shape)
(1171, 3)
# Attach the curated assay category to each non-core prediction;
# assays without a curated label become "not_looked".
df_w_cats = non_core_preds.merge(
    non_core_categories_df[["assay", "assay_category"]],
    left_on="assay",
    right_on="assay",
    how="left",
)
print(df_w_cats.shape)
# NOTE(review): fillna runs on the WHOLE frame, not just `assay_category` —
# every NaN in any column becomes "not_looked"; confirm this is intended.
df_w_cats.fillna("not_looked", inplace=True)
display(df_w_cats["assay_category"].value_counts(dropna=False))
(4061, 221)
assay_category trx_reg 1978 not_looked 1336 insulator 526 polycomb 94 other/mixed 72 heterochrom 42 splicing 13 Name: count, dtype: int64
def create_non_core_preds_df(df: pd.DataFrame, min_pred: float = 0.6):
    """Build a per-assay table of predicted-class counts for non-core assays.

    Only predictions with "Max pred" >= `min_pred` are counted. The returned
    frame has one row per assay, one column per predicted class (missing
    combinations filled with 0), plus an "assay_category" column.
    """
    category_lookup = dict(zip(df["assay"], df["assay_category"]))
    per_assay_counts = {}
    for assay_name, assay_group in df.groupby("assay"):
        confident = assay_group[assay_group["Max pred"] >= min_pred]
        counts = (
            confident.groupby(["Predicted class"])
            .size()
            .reset_index(name="Count")  # type: ignore
            .sort_values(by=["Count"], ascending=False)
        )
        per_assay_counts[assay_name] = dict(zip(counts["Predicted class"], counts["Count"]))
    # Assays as rows, predicted classes as columns.
    out = pd.DataFrame(per_assay_counts).fillna(0).astype(int).T
    out["assay_category"] = out.index.map(category_lookup)
    return out
# Save the per-assay predicted-class tables at several confidence thresholds.
for min_pred in [0, 0.6, 0.8]:
    predicted_classes_df = create_non_core_preds_df(df_w_cats, min_pred=min_pred)
    # BUG FIX: the filename interpolated `{name}`, a stale variable leaked from
    # a previous loop; these are 7c-model predictions (consistent with the
    # hardcoded "7c" used for the assay-category file below).
    predicted_classes_df.to_csv(
        output_dir
        / f"encode_non-core_7c_predictions_per_assay_minPred{min_pred:.2f}.csv"
    )
# Figure directory for the non-core assay prediction plots; must already exist.
section_fig_dir = base_fig_dir / "encode_predictions" / "assay_epiclass" / "non-core"
if not section_fig_dir.exists():
    raise FileNotFoundError(f"Directory {section_fig_dir} does not exist.")
def create_structured_dataframe(df_w_cats):
    """Create a structured dataframe with the percentage of predictions for each assay category.

    For every predicted class and every minimum-prediction-score threshold,
    tabulates the share of samples per assay category. Returns a dataframe
    indexed by (Predicted class, Min pred, assay_category) with "Percentage",
    "Count" and "Total samples" columns.
    """
    # Thresholds: 0.00, 0.05, ..., 0.95, plus 0.99.
    thresholds = list(np.arange(0, 1, 0.05)) + [0.99]
    rows = []
    for pred_class, class_group in df_w_cats.groupby("Predicted class"):
        for threshold in thresholds:
            kept = class_group[class_group["Max pred"] >= threshold]
            category_counts = kept["assay_category"].value_counts(dropna=False)
            total = category_counts.sum()
            fractions = (category_counts / total * 100).round(2)
            rows.extend(
                {
                    "Predicted class": pred_class,
                    "Min pred": threshold,
                    "assay_category": category,
                    "Percentage": pct,
                    "Count": category_counts[category],
                    "Total samples": total,
                }
                for category, pct in fractions.items()
            )
    return pd.DataFrame(rows).set_index(
        ["Predicted class", "Min pred", "assay_category"]
    )
assay_category_df = create_structured_dataframe(df_w_cats)
assay_category_df.to_csv(output_dir / "encode_non-core_7c_predictions_assay_category.csv")
# Restrict to assays whose category was actually reviewed.
df = df_w_cats[df_w_cats["assay_category"] != "not_looked"]
print(df.shape)
print(f"{df['assay'].nunique()} unique assays.")
df_w_cats["assay"].value_counts(dropna=False)
(2725, 221) 238 unique assays.
assay
ctcf 468
h3k9ac 117
polr2a 115
h3k4me2 77
h4k20me1 57
...
prdm6 1
glis2 1
znf546 1
mterf2 1
bnc2 1
Name: count, Length: 1170, dtype: int64
# Fixed plotting order for the core-7 predicted classes (as rank mapping).
assay_epiclass_order = [
    "h3k27ac",
    "h3k4me3",
    "h3k4me1",
    "h3k9me3",
    "h3k27me3",
    "h3k36me3",
    "input",
]
assay_epiclass_order = {assay: i for i, assay in enumerate(assay_epiclass_order)}
fig_dir = section_fig_dir / "stacked_bar_X_assay_category"
fig_dir.mkdir(parents=False, exist_ok=True)
# X-axis display order for assay categories.
assay_categories_order = [
    "trx_reg",
    "heterochrom",
    "polycomb",
    "splicing",
    "insulator",
    "other/mixed",
]
# Stacked bar chart: core-7 predicted-class composition per assay category.
# for min_pred in [0, 0.6, 0.8]:
for min_pred in [0.6]:
    # Keep only confident predictions.
    sub_df = df[df["Max pred"] >= min_pred]
    print(f"Min pred: {min_pred}, Samples: {sub_df.shape[0]} ({sub_df.shape[0]/df.shape[0]:.2%})")
    # Count predictions per (assay category, predicted class).
    groupby = (
        sub_df.groupby(["assay_category", "Predicted class"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["assay_category", "Count"], ascending=[True, False])
    )
    # Percentage within each assay category.
    groupby["Percentage"] = groupby.groupby("assay_category")["Count"].transform(
        lambda x: (x / x.sum()) * 100
    )
    # Add order for plotting (fixed core-7 assay order).
    groupby["assay_order"] = groupby["Predicted class"].map(assay_epiclass_order)
    groupby = groupby.sort_values(
        by=["assay_category", "assay_order"], ascending=[False, True]
    )
    print(groupby, "\n")
    # Main plot
    fig = px.bar(
        groupby,
        x="assay_category",
        y="Percentage",
        color="Predicted class",
        barmode="stack",
        category_orders={"assay_category": assay_categories_order},
        color_discrete_map=assay_colors,
        title=f"core7 predictions for non-core assays, predScore >= {min_pred:.2f}",
        labels={"Percentage": "Fraction (%)", "assay_category": "Assay Category"},
    )
    # Modify x-axis labels to include per-category sample totals.
    total_counts = groupby.groupby("assay_category")["Count"].sum()
    ticktext = [
        f"{assay_category} (N={total_counts[assay_category]})"
        for assay_category in assay_categories_order
    ]
    fig.update_xaxes(tickvals=assay_categories_order, ticktext=ticktext)
    # Save and display
    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    fig.write_html(fig_dir / f"{figname}.html")
    fig.write_image(fig_dir / f"{figname}.png")
    fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()
Min pred: 0.6, Samples: 2055 (75.41%) assay_category Predicted class Count Percentage assay_order 19 trx_reg h3k27ac 628 39.797212 0 23 trx_reg h3k4me3 538 34.093790 1 22 trx_reg h3k4me1 195 12.357414 2 24 trx_reg h3k9me3 7 0.443599 3 20 trx_reg h3k27me3 6 0.380228 4 21 trx_reg h3k36me3 4 0.253485 5 25 trx_reg input 200 12.674271 6 16 splicing h3k27ac 1 10.000000 0 17 splicing h3k36me3 7 70.000000 5 18 splicing input 2 20.000000 6 14 polycomb h3k4me3 7 8.536585 1 13 polycomb h3k27me3 74 90.243902 4 15 polycomb input 1 1.219512 6 8 other/mixed h3k27ac 12 27.906977 0 10 other/mixed h3k4me3 12 27.906977 1 9 other/mixed h3k4me1 6 13.953488 2 11 other/mixed h3k9me3 2 4.651163 3 12 other/mixed input 11 25.581395 6 4 insulator h3k27ac 3 0.983607 0 6 insulator h3k4me3 46 15.081967 1 5 insulator h3k27me3 2 0.655738 4 7 insulator input 254 83.278689 6 0 heterochrom h3k27ac 1 2.702703 0 1 heterochrom h3k4me3 2 5.405405 1 2 heterochrom h3k9me3 24 64.864865 3 3 heterochrom input 10 27.027027 6
def create_assay_category_graphs(df, output_dir: Path | None = None):
    """Graph assay category distribution for each predicted class.

    For every predicted class, plots the percentage of each assay category as a
    function of the minimum prediction score, plus a dashed black line showing
    the fraction of samples conserved at each threshold.

    Args:
        df: Dataframe indexed by (Predicted class, Min pred, assay_category)
            with "Percentage" and "Total samples" columns, as produced by
            create_structured_dataframe.
        output_dir: If given, each figure is also saved as png/svg/html.
    """
    # Get unique predicted classes
    predicted_classes = df.index.get_level_values("Predicted class").unique()
    assay_categories = df.index.get_level_values("assay_category").unique()
    # Stable color per category across all figures.
    graph_colors = {
        cat: px.colors.qualitative.Safe[i]
        for i, cat in enumerate(sorted(assay_categories))
    }
    # Create a figure for each predicted class
    for predicted_class in predicted_classes:
        df_class = df.loc[predicted_class]
        # Get unique assay categories for this predicted class
        assay_categories = df_class.index.get_level_values("assay_category").unique()
        # Reference count: all samples present at threshold 0.
        total_samples_at_zero = df_class.xs(0, level="Min pred")["Total samples"].iloc[0]
        # Create the figure
        fig = go.Figure()
        for assay_category in assay_categories:
            df_assay = df_class.xs(assay_category, level="assay_category")
            fig.add_trace(
                go.Scatter(
                    x=df_assay.index,
                    y=df_assay["Percentage"],
                    mode="lines+markers",
                    name=assay_category,
                    marker=dict(color=graph_colors[assay_category]),
                )
            )
        # Dashed line: percentage of samples kept at each threshold.
        conserved_percentages = (
            df_class.groupby("Min pred")["Total samples"].first()
            / total_samples_at_zero
            * 100
        )
        fig.add_trace(
            go.Scatter(
                x=conserved_percentages.index,
                y=conserved_percentages.values,
                mode="lines+markers",
                name="Samples Conserved",
                line=dict(dash="dash", color="black"),
            )
        )
        # Update layout
        fig.update_layout(
            title=f"Composition for Predicted Class: {predicted_class}",
            xaxis_title="Min pred",
            yaxis_title="Percentage Composition",
            legend_title="Assay Category",
            hovermode="x unified",
        )
        fig.update_xaxes(range=[-0.01, 1.01])
        fig.update_yaxes(range=[0, 100])
        # Save
        if output_dir:
            # Fix: the computed filename was previously ignored and a literal
            # placeholder name was written, so every predicted class
            # overwrote the same files.
            filename = f"encode_non-core_7c_predictions_assay_category_{predicted_class}"
            fig.write_image(output_dir / f"{filename}.png")
            fig.write_image(output_dir / f"{filename}.svg")
            fig.write_html(output_dir / f"{filename}.html")
        fig.show()
# Assuming df_structured is your dataframe from the previous step
fig_dir = (
    base_fig_dir
    / "encode_predictions"
    / "assay_epiclass"
    / "non-core"
    / "line_graphs_over_min_pred"
)
fig_dir.mkdir(parents=False, exist_ok=True)
# Display only; pass output_dir=fig_dir to also save the figures.
# create_assay_category_graphs(df=assay_category_df, output_dir=fig_dir)
create_assay_category_graphs(df=assay_category_df)
Pooling all the predictions together to obtain accuracy/F1 for each of the 5 classifiers, on core and non-core data respectively. (For assay it gets messier: non-core data cannot be evaluated directly.)
# create new life stage classification
# Collapse fine-grained life stages into adult / prenatal / child.
merge_life_stage = {
    "adult": "adult",
    "embryo": "prenatal",
    "fetal": "prenatal",
    "newborn": "prenatal",
    "child": "child",
}
all_preds_no_epiatlas = all_preds_no_epiatlas.copy(
    deep=True
)  # to avoid SettingWithCopyWarning
# Derive "<label>_merged" columns for metadata, true and predicted labels.
for label in [
    LIFE_STAGE,
    f"True class ({LIFE_STAGE})",
    f"Predicted class ({LIFE_STAGE})",
]:
    new_label = label.replace(LIFE_STAGE, f"{LIFE_STAGE}_merged")
    all_preds_no_epiatlas[new_label] = all_preds_no_epiatlas[label].map(merge_life_stage)
# The merged task reuses the original task's prediction scores.
all_preds_no_epiatlas[f"Max pred ({LIFE_STAGE}_merged)"] = all_preds_no_epiatlas[
    f"Max pred ({LIFE_STAGE})"
]
# Compute per-assay accuracy/F1 for all tasks, excluding EpiAtlas samples.
folders_to_save = [
    base_fig_dir / "encode_predictions",
    base_data_dir / "training_results" / "predictions" / "encode",
    table_dir / "dfreeze_v2" / "predictions" / "metrics",
]
metrics_handler = MetricsPerAssay()
filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas"
dfs_dict: Dict[str, pd.DataFrame] = metrics_handler.compute_multiple_metric_formats(  # type: ignore
    preds=all_preds_no_epiatlas,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
    return_df=True,
)
# For future use
# Fix: key on the computed filename rather than a literal placeholder string.
# NOTE(review): assumes dfs_dict keys are "<general_filename>.tsv" — confirm
# against MetricsPerAssay.compute_multiple_metric_formats.
structured_df = dfs_dict[f"{filename}.tsv"]
# Keep only the 16 cell types shared with EpiAtlas.
preds_no_epiatlas_16ct = all_preds_no_epiatlas[
    all_preds_no_epiatlas[CELL_TYPE].isin(EPIATLAS_16_CT)
]
filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas_16ct"
metrics_handler.compute_multiple_metric_formats(
    preds=preds_no_epiatlas_16ct,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
)
# No cell line: drop "cell line" / "other" / unknown biomaterial types.
df = all_preds_no_epiatlas.copy(deep=True)
# Fix: assign the result instead of chained `inplace=True`, which is
# deprecated (pandas chained assignment) and may silently not modify `df`
# under Copy-on-Write.
df[BIOMAT] = df[BIOMAT].fillna("unknown")
df = df[~df[BIOMAT].isin(["cell line", "other", "unknown"])]
filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas_no_cell_line"
metrics_handler.compute_multiple_metric_formats(
    preds=df,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
)
def plot_encode_metrics_per_assay(
    df_acc_per_assay: pd.DataFrame, min_pred: float = 0, logdir: Path | None = None
) -> None:
    """Plot accuracy+F1 of each classification task, per assay.

    Args:
        df_acc_per_assay: Per-task, per-assay metrics with "min_predScore",
            "task_name", "acc" and "f1-score" columns (an "avg-core" assay row
            holds the average over core assays).
        min_pred: Prediction-score threshold whose metrics rows are plotted.
        logdir: If given, figures are also saved as png/svg/html.
    """
    df = copy.deepcopy(df_acc_per_assay)
    # Selecting min_pred (tolerant float comparison).
    to_plot = df[
        (df["min_predScore"] > (min_pred - 0.01))
        & (df["min_predScore"] < (min_pred + 0.01))
    ]
    # sort tasks by avg_acc (descending)
    averages = to_plot[to_plot[ASSAY] == "avg-core"]
    avg_acc = list(zip(averages["acc"], averages["task_name"]))
    task_order = [
        task_name for _, task_name in sorted(avg_acc, key=lambda x: x[0], reverse=True)
    ]
    # Removing undesired assays
    to_plot = to_plot[to_plot[ASSAY].isin(CORE_ASSAYS + ["rna_seq"])]
    # Display names for each task.
    names = {
        f"{ASSAY}_7c": "assay_7c",
        f"{ASSAY}_11c": "assay_11c",
        LIFE_STAGE: "life stage",
        f"{LIFE_STAGE}_merged": "life stage (merged)",
        CELL_TYPE: "cell type",
        SEX: "sex",
        CANCER: "cancer",
        BIOMAT: "biomaterial_type",
    }
    # Plot each task
    for metric in ["acc", "f1-score"]:
        fig = go.Figure()
        for task_name in task_order:
            task_df = to_plot[to_plot["task_name"] == task_name]
            task_name = names[task_name]
            # Assay tasks only get the accuracy plot.
            if task_name in ["assay_7c", "assay_11c"] and metric == "f1-score":
                continue
            # Gray box summarizing the per-assay metric distribution.
            fig.add_trace(
                go.Box(
                    x=[task_name] * len(task_df),
                    y=task_df[metric],
                    name=metric,
                    boxpoints="outliers",
                    boxmean=True,
                    marker_color="gray",
                    showlegend=False,
                    hoverinfo="skip",
                )
            )
            # One colored point per assay.
            fig.add_trace(
                go.Scatter(
                    x=[task_name] * len(task_df),
                    y=task_df[metric],
                    mode="markers",
                    name=task_name,
                    marker_color=[assay_colors[assay] for assay in task_df[ASSAY]],
                    hoverinfo="text",
                    hovertext=[
                        f"{assay}: {value:.3f}"
                        for assay, value in zip(task_df[ASSAY], task_df[metric])
                    ],
                    showlegend=False,
                )
            )
        y_axis_label = "F1-score" if metric == "f1-score" else "Accuracy"
        fig.update_layout(
            xaxis_title="Classification task",
            yaxis_title=y_axis_label,
            font=dict(size=18),
            width=800,
            height=600,
            title=f"ENCODE: Task {y_axis_label} per assay (min_predScore={min_pred:.2f})",
        )
        fig.update_yaxes(range=[0, 1.01])
        # Show/Write the plot
        if logdir:
            # Fix: the computed filename was previously ignored and a literal
            # placeholder name was written, so every metric/threshold
            # overwrote the same files.
            filename = f"encode_6tasks_metrics_per_assay-{metric}-{min_pred*100:.0f}"
            fig.write_image(logdir / f"{filename}.png")
            fig.write_image(logdir / f"{filename}.svg")
            fig.write_html(logdir / f"{filename}.html")
        fig.show()
logdir = base_fig_dir / "encode_predictions" / "metrics_per_assay"
if not logdir.exists():
    logdir.mkdir(parents=True)
# One set of figures per prediction-score threshold.
for min_pred in [0, 0.6, 0.8, 0.9]:
    plot_encode_metrics_per_assay(structured_df, min_pred=min_pred, logdir=logdir)
def plot_all_acc_per_assay(
    graph_df: pd.DataFrame, minY, maxY, logdir: Path | None = None
):
    """Plot accuracy per assay, per min_predScore, per scatter_name/task_name,
    for core vs non-core assays.

    Note: minY/maxY arguments are overridden per graph type below.
    """
    # NOTE(review): min_predScore is compared against strings below, so the
    # column is assumed to be string-typed here — confirm upstream dtype.
    min_predScore_color_map = {"0.0": "blue", "0.6": "orange", "0.9": "red"}
    # NOTE(review): this adds a column to the caller's dataframe (side effect).
    graph_df["scatter_name"] = graph_df["task_name"].replace(
        "harmonized_", "", regex=True
    )
    graph_df = graph_df.sort_values(by=[ASSAY, "min_predScore", "scatter_name"])
    for graph_type in ["core", "non-core"]:
        df = graph_df.copy(deep=True)
        # Select assays and y-axis range per graph type.
        if graph_type == "core":
            df = df[df[ASSAY].isin(CORE_ASSAYS + ["rna_seq"])]
            minY = 0.55
            maxY = 1.001
        elif graph_type == "non-core":
            df = df[~df[ASSAY].isin(CORE_ASSAYS)]
            minY = 0
            maxY = 1
        else:
            raise ValueError(f"Invalid graph type: {graph_type}")
        unique_assays = list(df[ASSAY].unique())
        # Calculate average over assays
        avg_df = df.groupby(["min_predScore", "scatter_name"])["acc"].mean().reset_index()
        avg_df[ASSAY] = "Average"
        # traces_per_assay = df["scatter_name"].nunique()
        fig = go.Figure()
        # One pair of traces (average + per-assay points) per threshold.
        for min_pred in ["0.0", "0.6", "0.9"]:
            df_subset = df[df["min_predScore"] == min_pred]
            avg_subset = avg_df[avg_df["min_predScore"] == min_pred]
            # Add average over assay trace
            fig.add_trace(
                go.Scatter(
                    x=["Average - " + name for name in avg_subset["scatter_name"]],
                    y=avg_subset["acc"],
                    mode="markers",
                    name=f"Avg Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                        symbol="star",
                    ),
                    hoverinfo="y+x",
                    showlegend=False,
                )
            )
            # Add individual assay traces
            hovertext = list(
                zip(
                    df_subset[ASSAY],
                    df_subset["nb_samples"].apply(lambda x: f"Samples: {x}"),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=df_subset[ASSAY] + " - " + df_subset["scatter_name"],
                    y=df_subset["acc"],
                    mode="markers",
                    name=f"Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                    ),
                    text=hovertext,
                    hoverinfo="text+y+x",
                )
            )
        # Modify x-axis tick labels: bold task names, repeated once per assay
        # group plus the leading "Average" group.
        # NOTE(review): relies on df_subset leaking from the loop above.
        ticktext = []
        tick_group = list(df_subset["scatter_name"].unique())
        for i, tick in enumerate(tick_group):
            tick_group[i] = f"<b>{tick}</b>"
        for i in range(len(unique_assays) + 1):
            ticktext.extend(tick_group)
        fig.update_xaxes(
            tickmode="array", ticktext=ticktext, tickvals=list(range(len(ticktext)))
        )
        # Add assay labels on top + vertical lines between assay groups
        fig.add_annotation(
            x=len(tick_group) / 2 - 0.5,
            y=1.05,
            yref="paper",
            text="Average",
            showarrow=False,
            font=dict(size=14),
        )
        fig.add_vline(
            x=len(tick_group) - 0.5, line_width=2, line_dash="solid", line_color="black"
        )
        fig.add_hline(y=1, line_width=1, line_color="black")
        for i, label in enumerate(unique_assays):
            fig.add_annotation(
                x=(i + 1) * len(tick_group) + len(tick_group) / 2 - 0.5,
                y=1.05,
                yref="paper",
                text=label,
                showarrow=False,
                font=dict(size=14),
            )
            fig.add_vline(
                x=(i + 1) * len(tick_group) - 0.5,
                line_width=1,
                line_dash="dash",
                line_color="black",
            )
        # titles + yaxis range
        fig.update_layout(
            title="ENCODE data - Label match per Assay and Task",
            xaxis_title="Assay - Task",
            yaxis_title="Match %",
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            width=1200,
            yaxis=dict(tickformat=".2%", range=[minY, maxY]),
        )
        # Show/Write the plot
        print(f"Graphing {graph_type}")
        if logdir:
            figname = f"encode_{graph_type}_acc_per_assay_minY{minY:.2f}"
            fig.write_html(logdir / f"{figname}.html")
            fig.write_image(logdir / f"{figname}.png")
            fig.write_image(logdir / f"{figname}.svg")
        fig.show()
# this_fig_dir = base_fig_dir / "encode_predictions" / "acc_per_assay"
# if not this_fig_dir.exists():
# raise FileNotFoundError(f"Folder {this_fig_dir} does not exist")
def graph_confusion_matrix(all_preds: pd.DataFrame, output_dir: Path):
    """Graph confusion matrix for each classification task, for both core and non-core assays.

    For each task and each score threshold (0/0.6/0.8/0.9), builds a confusion
    matrix from the non-"unknown" true/predicted labels and saves it in all
    ConfusionMatrixWriter formats under output_dir/<task name>.
    """
    df = copy.deepcopy(all_preds)
    # Merge assay sub-labels into their parent category before matching.
    assay_merger = {
        "mrna_seq": "rna_seq",
        "wgbs-pbat": "wgbs",
        "wgbs-standard": "wgbs",
    }
    for col in df.columns:
        if ASSAY in col:
            # Fix: assign instead of chained `inplace=True` replace, which is
            # deprecated and may silently not modify `df` under Copy-on-Write.
            df[col] = df[col].replace(assay_merger)
    for graph_type in ["core", "non-core"]:
        print(f"Graphing {graph_type}")
        if graph_type == "core":
            sub_df = df[df[ASSAY].isin(ASSAY_ORDER)]
        elif graph_type == "non-core":
            sub_df = df[df[ASSAY].isin(["ctcf", "non-core"])]
        else:
            raise ValueError(f"Invalid graph_type: {graph_type}")
        for name in [
            f"{ASSAY}_7c",
            f"{ASSAY}_11c",
            CELL_TYPE,
            SEX,
            LIFE_STAGE,
            f"{LIFE_STAGE}_merged",
            CANCER,
        ]:
            logdir = output_dir / name
            # Fix: idempotent directory creation (no exists() race/check).
            logdir.mkdir(parents=True, exist_ok=True)
            # Cell type is skipped for core; assay tasks are skipped for non-core.
            if name == CELL_TYPE and graph_type == "core":
                continue
            if name in [f"{ASSAY}_7c", f"{ASSAY}_11c"] and graph_type == "non-core":
                continue
            y_true_col = f"True class ({name})"
            y_pred_col = f"Predicted class ({name})"
            max_pred_label = f"Max pred ({name})"
            task_df = sub_df.copy(deep=True)
            task_df = task_df.fillna("unknown")
            # Drop samples with no usable true or predicted label.
            for col in [y_true_col, y_pred_col]:
                task_df = task_df[task_df[col] != "unknown"]
            if name == CELL_TYPE:
                task_df = task_df[task_df[CELL_TYPE].isin(EPIATLAS_16_CT)]
            print(name, task_df.shape)
            for threshold in [0, 0.6, 0.8, 0.9]:
                filtered_df = task_df[task_df[max_pred_label] >= threshold]
                if filtered_df.shape[0] == 0:
                    print(f"No data for {name} with threshold {threshold}. Skipping.")
                    continue
                true, pred = filtered_df[y_true_col], filtered_df[y_pred_col]
                # Fixed label set for cell type; observed labels otherwise.
                if name == CELL_TYPE:
                    labels = EPIATLAS_16_CT
                else:
                    labels = set(true.unique()) | set(pred.unique())
                labels = sorted(labels)
                cm = confusion_matrix(true, pred, labels=labels)
                writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
                writer.to_all_formats(
                    logdir=logdir,
                    name=f"{name}-{graph_type}-confusion_matrix-{threshold*100:.0f}",
                )
                # Free matplotlib figures created while writing.
                plt.close("all")
# Write all confusion matrices for the non-EpiAtlas ENCODE predictions.
cm_logdir = base_fig_dir / "encode_predictions" / "confusion_matrices"
graph_confusion_matrix(all_preds_no_epiatlas, cm_logdir)
Graphing core assay_epiclass_7c (3066, 280) assay_epiclass_11c (4716, 280) harmonized_donor_sex (4157, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
harmonized_donor_life_stage (4036, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
harmonized_donor_life_stage_merged (3208, 280) harmonized_sample_cancer_high (1986, 280) Graphing non-core harmonized_sample_ontology_intermediate (262, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
harmonized_donor_sex (3990, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
harmonized_donor_life_stage (3895, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide /home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning: invalid value encountered in true_divide
harmonized_donor_life_stage_merged (3098, 280) harmonized_sample_cancer_high (4061, 280)
# Load track-type classifier predictions on ENCODE files.
track_type_pred_path = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "track_type"
    / "split0_test_prediction_100kb_all_none_all.list.csv"
)
track_type_pred_df = pd.read_csv(track_type_pred_path)
# Columns after the first three hold the per-class prediction vector.
pred_vector_cols = list(track_type_pred_df.columns[3:])
track_type_pred_df["Max_pred_track_type"] = track_type_pred_df.loc[
    :, pred_vector_cols
].max(axis=1)
# Join with the complete metadata; "Unnamed: 0" is the file accession column
# produced by the CSV's unnamed index.
track_type_df = track_type_pred_df.merge(
    complete_metadata_df, left_on="Unnamed: 0", right_on="FILE_accession", how="inner"
)
print(track_type_df.shape, encode_df.shape, track_type_pred_df.shape)
(9601, 219) (252, 5) (9619, 13)
# write each table in a separate excel sheet
# Write count and relative (%) pivot tables of track-type predictions per
# assay, one pair per min-pred threshold, appended into a single CSV file.
output = track_type_pred_path.parent / "track_type_predictions_pivot.csv"
output.unlink(missing_ok=True)  # start from a clean file before appending
verbose = False
with open(output, "a", encoding="utf8") as csv_stream:
    for min_pred in [0, 0.6, 0.8]:
        df = track_type_df[track_type_df["Max_pred_track_type"] >= min_pred]
        pivot = df.pivot_table(
            index=ASSAY,
            columns="Predicted class",
            values="Max_pred_track_type",
            aggfunc="count",
            fill_value=0,
            margins=True,
        ).astype(int)
        # Row-wise percentages; the "All" margin column holds the row total.
        relative_pivot = pivot.div(pivot["All"], axis=0) * 100
        csv_stream.write(f"Count Pivot - Min pred: {min_pred}\n")
        pivot.to_csv(csv_stream)
        csv_stream.write("\n")
        csv_stream.write(f"Relative Pivot - Min pred: {min_pred}\n")
        relative_pivot.to_csv(csv_stream)
        csv_stream.write("\n")
        if verbose:
            display(pivot)
            # pylint: disable=consider-using-f-string
            with pd.option_context("display.float_format", "{:.2f}".format):
                display(relative_pivot)
# RNA-Seq assay accuracy under the 11-class assay model, keeping
# mrna_seq and rna_seq as distinct classes.
rna_df = all_preds_no_epiatlas[all_preds_no_epiatlas[ASSAY].str.contains("rna")].copy(
    deep=True
)
print("RNA-Seq assay accuracy, if mrna_seq != rna_seq\n")
assay_label = f"{ASSAY}_11c"
for min_pred in [0, 0.6, 0.8]:
    # Keep only predictions at or above the score threshold.
    df = rna_df[rna_df[f"Max pred ({assay_label})"] >= min_pred]
    acc = len(
        df[df[f"True class ({assay_label})"] == df[f"Predicted class ({assay_label})"]]
    ) / len(df)
    print(
        f"Min pred: {min_pred}, Accuracy: {acc:.4f}. Samples: {len(df)}/{rna_df.shape[0]}\n"
    )
    # Breakdown of predicted classes per true assay.
    groupby = (
        df.groupby([ASSAY, f"Predicted class ({assay_label})"])
        .size()
        .reset_index()
        .rename(columns={0: "Count"})
        .sort_values([ASSAY, "Count"], ascending=False)
    )
    print(groupby, "\n")
RNA-Seq assay accuracy, if mrna_seq != rna_seq Min pred: 0, Accuracy: 0.9265. Samples: 1552/1552 assay_epiclass Predicted class (assay_epiclass_11c) Count 3 rna_seq rna_seq 885 2 rna_seq mrna_seq 63 0 mrna_seq mrna_seq 553 1 mrna_seq rna_seq 51 Min pred: 0.6, Accuracy: 0.9427. Samples: 1483/1552 assay_epiclass Predicted class (assay_epiclass_11c) Count 3 rna_seq rna_seq 877 2 rna_seq mrna_seq 47 0 mrna_seq mrna_seq 521 1 mrna_seq rna_seq 38 Min pred: 0.8, Accuracy: 0.9514. Samples: 1357/1552 assay_epiclass Predicted class (assay_epiclass_11c) Count 3 rna_seq rna_seq 835 2 rna_seq mrna_seq 37 0 mrna_seq mrna_seq 456 1 mrna_seq rna_seq 29